In this markdown document, we will demonstrate how you could visualize the different methods for clustering and dimensionality reduction, as stored in the data frame (the output from the clustering_dimensionalityreduction_pseudotime script/markdown)

As input we will use the deposited ‘df.csv’ file, which contains the expression values of 275856 CD4 T cells, HSNE-based clustres, flowsom clusters, phenograph clusters, umap coordinates, diffusion map coordinates and pseudotime values.

Load packages

library(dplyr)
library(ggplot2)
library(scales)
library(reshape2)
library(RColorBrewer)
library(ggrepel)

Load dataframe

##      FSC.A  FSC.H    FSC.W    SSC.A SSC.H    SSC.W        CD95         CD8
## 1 123725.3 114537 70793.39 29916.04 29725 65957.20  0.04919554 -0.04211360
## 2 129744.8 113527 74898.12 37344.06 35011 69903.19  0.06008087 -0.07160002
## 3 134425.1 122187 72100.02 29106.00 28719 66419.13 -0.08300304 -0.05722039
## 4 129715.4 117286 72481.21 38762.57 36082 70404.73  0.02296851 -0.22416283
## 5 135594.0 118113 75235.51 36312.43 34373 69233.75 -0.06238258  0.07444189
## 6 109618.7  98183 73169.21 24938.76 24099 67819.69  0.18458755  0.09685653
##       CD27      CXCR4     CCR7 LIVE.DEAD      CD4   CD45RA      CD3       CD49b
## 1 2.043456 -154.76010 1.617012 418.72021 1.115082 1.509235 2.494015 -0.15329744
## 2 1.899140  -55.10397 1.295678  62.95781 1.473094 1.649236 2.228497  0.13627116
## 3 2.095128 -138.39290 1.184099 773.71948 1.358547 1.497685 2.266846  0.04310955
## 4 1.978630   66.90539 1.450845 194.00330 1.453925 1.691411 2.390522  0.09496491
## 5 2.137328   88.51750 1.517236 244.99280 1.276686 1.428678 2.169388 -0.07254910
## 6 2.151697  -25.66885 1.486741 778.22522 1.317697 1.639596 2.038711 -0.06887352
##      CD14.19         CD69       CD103   TIME    CD95_150     CD8_150 CD27_150
## 1   85.44810  0.085233651  0.02757605 1759.5  0.17108983 -0.08468185 2.585516
## 2   98.47850  0.015498741  0.07988916  142.0  0.20849496 -0.11978610 2.438651
## 3 -336.29279  0.004213435  0.03451059 7317.7 -0.28636056 -0.09378094 2.640524
## 4  -98.21554 -0.022149587 -0.09928628 9650.9  0.08015787 -0.38231894 2.520807
## 5 -173.81610  0.115474455  0.03658240 5833.8 -0.21637227  0.11705446 2.681148
## 6  109.13980  0.026784593 -0.06232975 7206.4  0.60996115  0.18435949 2.700062
##   CCR7_150 CD45RA_150  CD3_150  CD49b_150  CD4_150    CD69_150   CD103_150
## 1 4.138186   2.714379 3.960085 -0.5831565 2.831010  0.45620728  0.09253515
## 2 3.694022   2.865701 3.726255  0.1908505 3.173416  0.08565834  0.26558015
## 3 3.619215   2.701740 3.762954  0.2957661 3.110011  0.02331229  0.11572015
## 4 3.983945   2.910694 3.889792  0.4169627 3.220305 -0.12226621 -0.32822508
## 5 4.009308   2.625654 3.622586 -0.3594562 3.012274  0.60314041  0.12263663
## 6 3.731990   2.855382 3.526417 -0.6445128 3.026464  0.14768833 -0.20805925
##   CSPLR_ST CSPLR_HsneDataX CSPLR_HsneDataY clusters_HSNE sampleID
## 1        6      -0.2343911        12.55609         CD4-1    CD3_W
## 2        0     -11.1102962       -21.56930         CD4-1    CD3_B
## 3        3      -0.2343911        12.55609         CD4-1    CD3_N
## 4        3      -0.2343911        12.55609         CD4-1    CD3_N
## 5        6      -0.2343911        12.55609         CD4-1    CD3_W
## 6        2       3.3519292        20.76395         CD4-1    CD3_R
##   clusters_flowsom         DC1          DC2          DC3   umap_1    umap_2
## 1                5 0.001300343 0.0005537734 0.0003425180 3.810218 1.3263930
## 2                5 0.001370561 0.0003254344 0.0002585017 3.800768 0.8864915
## 3                2 0.001196715 0.0005467035 0.0003490013 3.750142 1.1966473
## 4                5 0.001471539 0.0003559209 0.0002495618 4.043976 1.0863283
## 5                2 0.001181110 0.0006063208 0.0003737570 3.738265 1.2708768
## 6                5 0.001446349 0.0004158300 0.0002839620 4.020594 1.0154871
##   merged_HSNE   curve1   curve2   curve3 clusters_phenograph
## 1       Naive 3.547367 3.563246 3.579921                  10
## 2       Naive 3.497071 3.512359 3.531119                  10
## 3       Naive 3.603737 3.620266 3.639434                  10
## 4       Naive 3.433274 3.448976 3.464703                  10
## 5       Naive 3.610096 3.627680 3.644634                  10
## 6       Naive 3.516714 3.529806 3.543224                  10

Visualization

Visualize parameters on diffusion map

Define a function for visualization of the diffusion map

viz.dm <- function(dat,dr, param.name,limits=NULL){
  ColVal <- dat[,param.name]
  if(is.null(limits)){
    Lim <- quantile(ColVal,probs=seq(0,1,0.01))[c(2,100)]
    p <- ggplot(dat, aes(x = DC1, y =DC2)) +geom_point(aes(color = ColVal), size=0.1)+theme_classic()+scale_color_distiller(name=param.name, palette = "RdYlBu", limits=Lim, oob=squish)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ggtitle(param.name)
  } else {
    p <- ggplot(dat, aes(x = DC1, y = DC2)) +geom_point(aes(color = ColVal), size=0.1)+theme_classic()+scale_color_distiller(name=param.name, palette = "RdYlBu", limits=c(limits[1],limits[2]), oob=squish)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ggtitle(param.name)
  }
  p
}

Define the parameters for which you would like generate plots

First check the order of columns

colnames(df)
##  [1] "FSC.A"               "FSC.H"               "FSC.W"              
##  [4] "SSC.A"               "SSC.H"               "SSC.W"              
##  [7] "CD95"                "CD8"                 "CD27"               
## [10] "CXCR4"               "CCR7"                "LIVE.DEAD"          
## [13] "CD4"                 "CD45RA"              "CD3"                
## [16] "CD49b"               "CD14.19"             "CD69"               
## [19] "CD103"               "TIME"                "CD95_150"           
## [22] "CD8_150"             "CD27_150"            "CCR7_150"           
## [25] "CD45RA_150"          "CD3_150"             "CD49b_150"          
## [28] "CD4_150"             "CD69_150"            "CD103_150"          
## [31] "CSPLR_ST"            "CSPLR_HsneDataX"     "CSPLR_HsneDataY"    
## [34] "clusters_HSNE"       "sampleID"            "clusters_flowsom"   
## [37] "DC1"                 "DC2"                 "DC3"                
## [40] "umap_1"              "umap_2"              "merged_HSNE"        
## [43] "curve1"              "curve2"              "curve3"             
## [46] "clusters_phenograph"

Provide the column numbers

parameters = colnames(df)[c(7:9, 11, 13:16,18,19)]

Visualize the parameters on the diffusion map

for (i in 1:length(parameters)){
  p=viz.dm(dat=df, param.name = parameters[i])
  print(p)
}

If you are not satisfied with the color scale, adjust the limits argument

viz.dm(dat=df,param.name='CD8', limits=c(-0.36,4.32))

viz.dm(dat=df, param.name='CD103', limits=c(-0.38, 3.16))

visualize clusters on diffusion map

label_HSNE_dm <- df%>%group_by(clusters_HSNE)%>%select(DC1, DC2)%>%summarize_all(mean)
label_flowsom_dm <- df%>%group_by(clusters_flowsom)%>%select(DC1, DC2)%>%summarize_all(mean)
label_pheno_dm <- df%>%group_by(clusters_phenograph)%>%select(DC1, DC2)%>%summarize_all(mean)

ggplot(df, aes(x=DC1, y=DC2, color=as.factor(clusters_HSNE)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_HSNE), data=label_HSNE_dm)+guides(colour=FALSE)+ggtitle('clusters HSNE')

ggplot(df, aes(x=DC1, y=DC2, color=as.factor(clusters_flowsom)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_flowsom), data=label_flowsom_dm)+guides(colour=FALSE)+ggtitle('clusters flowsom')

ggplot(df, aes(x=DC1, y=DC2, color=as.factor(clusters_phenograph)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_phenograph), data=label_pheno_dm)+guides(colour=FALSE)+ggtitle('clusters phenograph')

visualize parameters on umap

Define a function for visualization of the umap

Visualize the parameters on the umap

for (i in 1:length(parameters)){
  p=viz.umap(dat=df, param.name = parameters[i])
  print(p)
}

If you are not satisfied with the color scale, adjust the limits argument

viz.umap(dat=df,param.name='CD8', limits=c(-0.36,4.32))

viz.umap(dat=df, param.name='CD103', limits=c(-0.38, 3.16))

visualize clusters on umap

label_HSNE_umap <- df%>%group_by(clusters_HSNE)%>%select(umap_1, umap_2)%>%summarize_all(mean)
label_flowsom_umap <- df%>%group_by(clusters_flowsom)%>%select(umap_1, umap_2)%>%summarize_all(mean)
label_pheno_umap <- df%>%group_by(clusters_phenograph)%>%select(umap_1, umap_2)%>%summarize_all(mean)

ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(clusters_HSNE)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_HSNE), data=label_HSNE_umap)+guides(colour=FALSE)+ggtitle('clusters HSNE')

ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(clusters_flowsom)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_flowsom), data=label_flowsom_umap)+guides(colour=FALSE)+ggtitle('clusters flowsom')

ggplot(df, aes(x=umap_1, y=umap_2, color=as.factor(clusters_phenograph)))+geom_point(size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+geom_label_repel(aes(label=clusters_phenograph), data=label_pheno_umap)+guides(colour=FALSE)+ggtitle('clusters phenograph')

Frequencies

Fill in for which clusters you would like to calculate the frequencies (either clusters_HSNE, clusters_flowsom, or clusters_phenograph)

counts <- as.data.frame.matrix(table(df$sampleID,df$clusters_HSNE))

Calculate the percentage of a certain sample present in a certain cluster

counts_percofsample = counts/rowSums(counts)*100
counts_percofsample$total <- rowSums(counts)
counts_percofsample$sampleID <- row.names(counts_percofsample)
counts_percofsample <- melt(counts_percofsample,  id.vars=c('sampleID', 'total'), variable.name='cluster', value.name='frequency')

Plot the frequencies in a boxplot (frequency = % of sample in cluster)

ggplot(counts_percofsample, aes(x=reorder(cluster, frequency, FUN=median), y=frequency))+ geom_boxplot(position='dodge2', outlier.shape=NA)+geom_jitter(aes(colour=sampleID), size=2)+xlab('cluster')

Plot the frequencies in bargraphs (frequency = % of sample in cluster)

Visualize pseudotime

If you would like to use other clusters, adjust ‘merged_HSNE’ to clusters_flowsom’, for instance

If there are more or less lineages identified adjust the number of curve columns

pseudotimevalues <- df%>%select(c(DC1, DC2,clusterID=merged_HSNE, curve1, curve2, curve3))

Preparation of table and color palette for visualization of lineages

#reshape table 
pseudotimevalues <- melt(pseudotimevalues, id.vars=c('clusterID', 'DC1', 'DC2'), variable.name='lineage', value.name = 'pseudotime')

#rename curve to lineage
pseudotimevalues$lineage <- gsub('curve','lineage', pseudotimevalues$lineage)

#exclude cells with NA pseudotimevalues (those cells are not present in all lineages and have NA values for the lineages in which they are absent)
pseudotimevaluesexclNA <- pseudotimevalues%>%filter(pseudotime!='NA')

#generate colorpalette
colors <- colorRampPalette(rev(brewer.pal(11, 'Spectral'))[-6])(100)

Plot the lineages: cells are colored by pseudotime

ggplot(pseudotimevaluesexclNA%>%arrange(pseudotime), aes(x=DC1, y=DC2))+geom_point(aes(color=pseudotime),size=0.1, alpha=0.3) +facet_wrap(~lineage)+scale_color_gradientn(colours=colors)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())

Plot the lineages: cells are colored by clusterID

ggplot(pseudotimevaluesexclNA%>%arrange(pseudotime), aes(x=DC1, y=DC2))+geom_point(aes(color=clusterID),size =0.1)+facet_wrap(~lineage)+theme_bw()+ guides(colour = guide_legend(override.aes = list(size=5)))

Jitterplot per lineage: cells are colored by clusterID

ggplot(pseudotimevaluesexclNA%>%arrange(pseudotime), aes(x=pseudotime, y=lineage)) + geom_jitter(aes(color=clusterID), size=0.1)+theme_bw()+theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank())+ guides(colour = guide_legend(override.aes = list(size=5)))